library(biomaRt)
library(limma)
library(affy)
library(preprocessCore)
library(UpSetR)
library(tidyverse)
library(RColorBrewer)
library(pheatmap)
library(edgeR)
library(DESeq2)
library(ggrepel)
library(pbapply)
library(viridis)
library(gplots)
library(msigdbr)
library(fgsea)
library(viridis)
library(UpSetR)
library(tximport)Analysis
Load packages
Data import
Import TPM count data from rsem .genes.results files:
# my code
directory_path <-
"~/OneDrive - ETH Zurich/CBB/FS1/Systems Genomics/sysgenom/data_new/gene_results/"
file_list <- list.files(directory_path)
expr <- data.frame(matrix(NA_integer_, nrow = 56884, ncol = 23))
names <- c()
for (i in file_list) {
names <- append(names, unlist(strsplit(i, ".genes.results")))
}
colnames(expr) <- names
for (i in 1:length(file_list)) {
file <- file.path(directory_path, file_list[i])
df <- read.csv(file, sep = "\t")
expr[, i] <- df$TPM
}
rownames(expr) <- df$gene_id
expr <- expr %>%
rownames_to_column(var = "gene_name") %>%
arrange(gene_name) %>%
column_to_rownames(var = "gene_name")
head(expr)# tutorial code
samples <- list.files("rsem")
expr2 <- sapply(samples, function(sample) {
file <- paste0("rsem/", sample, "/", sample, ".genes.results")
quant <- read.csv(file, sep = "\t", header = T)
tpm2 <- setNames(quant$TPM, quant$gene_id)
return(tpm2)
})
expr <- as.data.frame(expr2)
expr <- expr %>%
rownames_to_column(var = "gene_name") %>%
arrange(gene_name) %>%
column_to_rownames(var = "gene_name")
head(expr) SRR21355241 SRR21355242 SRR21355243 SRR21355246
ENSMUSG00000000001.5 19.49 21.47 20.63 16.42
ENSMUSG00000000003.16 0.00 0.00 0.00 0.00
ENSMUSG00000000028.16 1.36 1.21 1.43 0.52
ENSMUSG00000000031.19 0.00 0.16 0.00 0.50
ENSMUSG00000000037.18 0.00 0.07 0.18 0.15
ENSMUSG00000000049.12 1.05 1.06 1.65 0.33
SRR21355249 SRR21355250 SRR21355256 SRR21355360
ENSMUSG00000000001.5 19.74 16.93 21.91 19.17
ENSMUSG00000000003.16 0.00 0.00 0.00 0.00
ENSMUSG00000000028.16 1.57 0.45 1.48 1.29
ENSMUSG00000000031.19 0.00 0.00 0.14 0.08
ENSMUSG00000000037.18 0.05 0.21 0.26 0.12
ENSMUSG00000000049.12 1.00 0.68 1.29 0.71
SRR21355364 SRR21355366 SRR21355367 SRR21355374
ENSMUSG00000000001.5 19.85 19.81 22.34 23.04
ENSMUSG00000000003.16 0.00 0.00 0.00 0.00
ENSMUSG00000000028.16 1.26 0.74 1.37 1.26
ENSMUSG00000000031.19 0.00 0.22 0.00 0.00
ENSMUSG00000000037.18 0.00 0.05 0.23 0.12
ENSMUSG00000000049.12 1.19 1.29 0.45 1.11
SRR21355378 SRR21355380 SRR21355381 SRR21355382
ENSMUSG00000000001.5 23.86 17.59 16.23 19.77
ENSMUSG00000000003.16 0.00 0.00 0.00 0.00
ENSMUSG00000000028.16 1.58 0.38 1.21 1.98
ENSMUSG00000000031.19 0.16 0.00 0.00 0.10
ENSMUSG00000000037.18 0.09 0.33 0.18 0.17
ENSMUSG00000000049.12 1.22 0.74 1.49 1.55
SRR21355383 SRR21355696 SRR21355697 SRR21355698
ENSMUSG00000000001.5 20.20 23.96 20.37 24.05
ENSMUSG00000000003.16 0.00 0.00 0.00 0.00
ENSMUSG00000000028.16 0.92 2.01 1.40 1.21
ENSMUSG00000000031.19 0.00 0.00 0.00 0.00
ENSMUSG00000000037.18 0.12 0.14 0.04 0.15
ENSMUSG00000000049.12 0.79 1.39 1.17 0.00
SRR21355699 SRR21355703 SRR21355705
ENSMUSG00000000001.5 17.57 18.97 21.85
ENSMUSG00000000003.16 0.00 0.00 0.00
ENSMUSG00000000028.16 0.89 1.72 1.35
ENSMUSG00000000031.19 0.00 0.00 0.17
ENSMUSG00000000037.18 0.00 0.16 0.07
ENSMUSG00000000049.12 0.27 0.25 1.52
Both are the same!
Import annotation data for the samples:
# read in meta data for the runs (contains sample and mouse id)
anno <- read.csv("/Users/gonuni/Desktop/College/CBB/1st Semester/Systems Genomics/Group_project/SraRunTable.txt", sep = ",")
# read in meta data for the samples (contains mouse id and age)
metadata <-
read.csv("/Users/gonuni/Desktop/College/CBB/1st Semester/Systems Genomics/Group_project/BulkSeq_Aging_metadata.txt", sep = "\t")
# join information from anno and metadata to have age of mouse in each sample
age <- metadata %>%
group_by(mouseID) %>%
summarise(age)
ages <- c()
for (i in anno$mouse_id_number) {
ages <- append(ages, age$age[age$mouseID == i])
}
anno$age <- ages
anno <- anno %>%
select(Run, mouse_id_number, sex, age, everything())
anno <- anno[order(anno$Run),]
head(anno) Run mouse_id_number sex age Assay.Type AvgSpotLen Bases
7 SRR21355241 35 male 18 RNA-Seq 300 6477564300
8 SRR21355242 34 male 15 RNA-Seq 300 4162732500
9 SRR21355243 33 male 3 RNA-Seq 300 3252736800
10 SRR21355246 30 male 15 RNA-Seq 300 2338102800
11 SRR21355249 28 male 21 RNA-Seq 300 7603170300
12 SRR21355250 27 male 18 RNA-Seq 300 4418850300
BioProject BioSample Bytes
7 PRJNA875066 SAMN30602244 2380886548
8 PRJNA875066 SAMN30602245 1551355324
9 PRJNA875066 SAMN30602246 1224759082
10 PRJNA875066 SAMN30602249 879784428
11 PRJNA875066 SAMN30602252 2814959314
12 PRJNA875066 SAMN30602253 1611542706
Center.Name
7 TONY WYSS-CORAY, NEUROLOGY & NEUROLOGICAL SCIENCES, STANFORD MEDICINE
8 TONY WYSS-CORAY, NEUROLOGY & NEUROLOGICAL SCIENCES, STANFORD MEDICINE
9 TONY WYSS-CORAY, NEUROLOGY & NEUROLOGICAL SCIENCES, STANFORD MEDICINE
10 TONY WYSS-CORAY, NEUROLOGY & NEUROLOGICAL SCIENCES, STANFORD MEDICINE
11 TONY WYSS-CORAY, NEUROLOGY & NEUROLOGICAL SCIENCES, STANFORD MEDICINE
12 TONY WYSS-CORAY, NEUROLOGY & NEUROLOGICAL SCIENCES, STANFORD MEDICINE
Consent DATASTORE.filetype DATASTORE.provider DATASTORE.region
7 public run.zq,fastq,sra s3,ncbi,gs s3.us-east-1,gs.US,ncbi.public
8 public sra,fastq,run.zq s3,gs,ncbi s3.us-east-1,gs.US,ncbi.public
9 public fastq,sra,run.zq s3,gs,ncbi gs.US,ncbi.public,s3.us-east-1
10 public run.zq,sra,fastq s3,gs,ncbi gs.US,s3.us-east-1,ncbi.public
11 public run.zq,sra,fastq s3,ncbi,gs gs.US,s3.us-east-1,ncbi.public
12 public run.zq,fastq,sra gs,ncbi,s3 gs.US,s3.us-east-1,ncbi.public
Experiment Instrument Library.Name LibraryLayout
7 SRX17360880 Illumina NovaSeq 6000 GSM6527217 PAIRED
8 SRX17360879 Illumina NovaSeq 6000 GSM6527216 PAIRED
9 SRX17360878 Illumina NovaSeq 6000 GSM6527215 PAIRED
10 SRX17360875 Illumina NovaSeq 6000 GSM6527211 PAIRED
11 SRX17360872 Illumina NovaSeq 6000 GSM6527208 PAIRED
12 SRX17360871 Illumina NovaSeq 6000 GSM6527207 PAIRED
LibrarySelection LibrarySource Organism Platform ReleaseDate
7 cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA 2023-04-18T00:00:00Z
8 cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA 2023-04-18T00:00:00Z
9 cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA 2023-04-18T00:00:00Z
10 cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA 2023-04-18T00:00:00Z
11 cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA 2023-04-18T00:00:00Z
12 cDNA TRANSCRIPTOMIC Mus musculus ILLUMINA 2023-04-18T00:00:00Z
create_date version Sample.Name source_name SRA.Study strain
7 2022-09-01T02:54:00Z 1 GSM6527217 Motor cortex SRP394927 C57BL/6JN
8 2022-09-01T02:37:00Z 1 GSM6527216 Motor cortex SRP394927 C57BL/6JN
9 2022-09-01T02:37:00Z 1 GSM6527215 Motor cortex SRP394927 C57BL/6JN
10 2022-09-01T02:32:00Z 1 GSM6527211 Motor cortex SRP394927 C57BL/6JN
11 2022-09-01T03:09:00Z 1 GSM6527208 Motor cortex SRP394927 C57BL/6JN
12 2022-09-01T02:45:00Z 1 GSM6527207 Motor cortex SRP394927 C57BL/6JN
tissue
7 Motor cortex
8 Motor cortex
9 Motor cortex
10 Motor cortex
11 Motor cortex
12 Motor cortex
This is also the reason why we need multiple replicates of the each conditions; or the experiment would need to be designed in a way that samples from “different conditions” can be seen as “cross-replicates”, for instance, a reasonale number of samples all with different time points to study transcriptomic changes across the time course, so that we see the time course as a continuous condition rather than each individual time point as one distinct condition.
Import additional gene information:
devtools::install_version("dbplyr", version = "2.3.4") httr::set_config(httr::config(ssl_verifypeer = FALSE))
ensembl <- useEnsembl(biomart = "ensembl",
dataset = "mmusculus_gene_ensembl")
meta_genes <- getBM(
attributes = c(
"ensembl_gene_id",
"ensembl_gene_id_version",
"external_gene_name",
"description",
"chromosome_name",
"start_position",
"end_position",
"strand"
),
filters = "ensembl_gene_id_version",
values = rownames(expr),
mart = ensembl
) %>%
right_join(data.frame(ensembl_gene_id_version = rownames(expr)),
by = "ensembl_gene_id_version") %>%
distinct(ensembl_gene_id_version, .keep_all = TRUE)
expr <- expr[meta_genes$ensembl_gene_id_version, ]
rownames(meta_genes) <- meta_genes$ensembl_gene_id_versionExploratory data analysis
We have 56884 annotated genes in 23 samples.
ggplot(data = as.data.frame(avg_expr), mapping = aes(x = avg_expr)) +
geom_histogram(
color = "white",
fill = brewer.pal(n = 3, name = "Set1")[2],
bins = 50
) +
labs(title = "Distribution of average expression values of all genes",
x = "Average expression",
y = "Count") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))ggplot(data = as.data.frame(avg_expr), mapping = aes(x = log(avg_expr + 1))) +
geom_histogram(
color = "white",
fill = brewer.pal(n = 3, name = "Set1")[2],
bins = 50
) +
labs(title = "Distribution of average expression values of all genes",
x = "log10(Average expression + 1)",
y = "Count") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))ggplot(data = as.data.frame(avg_expr), mapping = aes(x = avg_expr)) +
geom_histogram(
color = "white",
fill = brewer.pal(n = 3, name = "Set1")[2],
bins = 50
) +
scale_x_continuous(
breaks = c(0, 1, 10, 100, 1000, 10000, 20000),
trans = "log1p",
expand = c(0, 0)
) +
scale_y_continuous(breaks = c(0, 1),
expand = c(0, 0),
trans = "log1p") +
labs(title = "Distribution of average expression values of all genes",
x = "log1p(Average expression)",
y = "log1p(Count)") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))num_det <- rowSums(expr > 0)
ggplot(data = as.data.frame(num_det), mapping = aes(x = num_det)) +
geom_histogram(
color = "white",
fill = brewer.pal(n = 3, name = "Set1")[2],
bins = 23
) +
labs(title = "Distribution of number samples in which each gene is detected",
x = "Number of samples",
y = "Count") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))The histograms show that many genes are not detected in all or most of the samples, or detected only with low expression. Thus, these genes are filtered out since they are probably not of interest:
# filter expressed genes and add this information to the meta_genes file
# threshold: genes must be detected in at least half of the samples
# and the average TPM must be >= 1
expressed <- rowMeans(expr > 0) >= 0.5 | rowMeans(expr) >= 1
meta_genes$expressed <- expressed
num_det <- rowSums(expr[meta_genes$expressed,] > 0)
ggplot(data = as.data.frame(num_det), mapping = aes(x = num_det)) +
geom_histogram(
color = "white",
fill = brewer.pal(n = 3, name = "Set1")[2],
bins = 23
) +
labs(title = "Distribution of number samples in which each gene is detected",
x = "Number of samples",
y = "Count") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))# compute pairwise correlation between the samples
corr_pearson <- cor(log1p(expr[meta_genes$expressed,]))
corr_spearman <- cor(expr[meta_genes$expressed,], method = "spearman")
pheatmap(corr_pearson)hcl_pearson <- hclust(as.dist(1 - corr_pearson))
hcl_spearman <- hclust(as.dist(1 - corr_spearman))
plot(hcl_pearson, labels = anno$age)# PCA dimensionality reduction
pca <-
prcomp(log1p(t(expr[meta_genes$expressed, ])), center = TRUE, scale. = TRUE)
eigs <- pca$sdev^2
plot(1:length(eigs), eigs)ggplot(data = as.data.frame(pca$x),
mapping = aes(
x = PC1,
y = PC2,
#color = as.factor(anno$age)
color = anno$age,
shape = anno$sex
)) +
geom_point(size = 4) +
labs(title = "PCA plot", color = "Age", shape = "Sex") +
scale_color_viridis_c() +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))# MDS dimensionality reduction
mds <- plotMDS(log1p(expr[meta_genes$expressed,]), plot = FALSE)
df <- data.frame(MDS1 = mds$x,
MDS2 = mds$y,
age = anno$age,
shape = anno$sex)
ggplot(df, aes(x = MDS1, y = MDS2, colour = age)) +
geom_point(size = 4) +
labs(title = "MDS plot", color = "Age") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
scale_color_viridis_c()Highly variable genes identification
Function to estimate the variability of the genes:
estimate_variability <- function(expr){
means <- apply(expr, 1, mean)
vars <- apply(expr, 1, var)
cv2 <- vars / means^2
minMeanForFit <- unname(median(means[which(cv2 > 0.3)]))
useForFit <- means >= minMeanForFit
fit <- glm.fit(x = cbind(a0 = 1, a1tilde = 1/means[useForFit]),
y = cv2[useForFit],
family = Gamma(link = "identity"))
a0 <- unname(fit$coefficients["a0"])
a1 <- unname(fit$coefficients["a1tilde"])
xg <- exp(seq(min(log(means[means>0])), max(log(means)), length.out=1000))
vfit <- a1/xg + a0
df <- ncol(expr) - 1
afit <- a1/means+a0
varFitRatio <- vars/(afit*means^2)
pval <- pchisq(varFitRatio*df,df=df,lower.tail=F)
res <- data.frame(mean = means,
var = vars,
cv2 = cv2,
useForFit = useForFit,
pval = pval,
padj = p.adjust(pval, method="BH"),
row.names = rownames(expr))
return(res)
}Test for significance of over dispersion:
Hierarchical clustering and PCA only with highly variable and expressed genes:
corr_spearman_highvar <-
cor(expr[meta_genes$highvar,], method = "spearman")
hcl_spearman_highvar <- hclust(as.dist(1 - corr_spearman_highvar))
plot(hcl_spearman_highvar, labels = anno$age)pca_highvar <-
prcomp(log1p(t(expr[meta_genes$highvar,])), center = TRUE, scale. = TRUE)
ggplot(data = as.data.frame(pca_highvar$x),
mapping = aes(x = PC1,
y = PC2,
color = anno$age,
shape = anno$sex)) +
geom_point(size = 4) +
labs(title = "PCA plot (only highly variable genes)", color = "Age", shape = "Sex") +
scale_color_viridis_c() +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))Differential expression analysis
To identify significant differential expression changes with age, we used the raw count matrix as recommended for the DEseq2 standard analysis pipeline. Factors and dispersion estimates were calculated for each region separately. We conducted differential expression analysis comparing samples from 3 months to each consecutive time point, using sex as covariate. This is consistent with previously published differential expression analyses performed across whole organs in mice.8 P values were adjusted for multiple testing, and genes with an adjusted P value of less than 0.05 were determined to be statistically significant. Finally, we required a gene to reach statistical significance (after multiple testing correction) in at least 2 pairwise comparisons (e.g. 3 months vs 18 months and 3 months vs 21 months) to be called a differentially expressed gene (DEG). We chose this criterion to retain only genes with robust differential expression patterns across age groups. We recognize that this tends to select against genes that are differentially expressed very late in life (i.e. 3 months vs 28 months).
ANOVA
DE_test <- function(expr,
cond,
ctrl = NULL,
covar = NULL,
padj_method = p.adjust.methods){
pval_fc <- data.frame(t(pbapply(expr, 1, function(e){
dat <- data.frame(y = log1p(e),
cond = cond)
if (! is.null(covar))
dat <- data.frame(dat, covar)
m1 <- lm(y ~ ., data = dat)
m0 <- lm(y ~ . - cond, data = dat)
test <- anova(m1, m0)
pval <- test$Pr[2]
avgs <- tapply(log1p(e), cond, mean)
if (! is.null(ctrl) && sum(cond %in% ctrl) > 0){
fc <- exp(max(avgs[names(avgs) != ctrl]) - avgs[ctrl])
} else{
fc <- exp(max(avgs) - min(avgs))
}
return(c(pval = unname(pval), fc = unname(fc)))
})), row.names = rownames(expr))
padj <- p.adjust(pval_fc$pval, method = padj_method)
return(data.frame(pval_fc, "padj" = padj)[,c("pval","padj","fc")])
}
# expressed genes
res_DE <- DE_test(expr = expr[meta_genes$expressed,],
cond = anno$age,
covar = anno$sex) %>%
tibble::rownames_to_column("gene")res_DE <- res_DE %>%
mutate(DE = padj < 0.1 & fc > 2) %>%
mutate(DEG = ifelse(DE, meta_genes$external_gene_name[meta_genes$expressed], NA))
ggplot(res_DE, aes(
x = log(fc),
y = -log10(padj),
col = DE,
label = DEG
)) +
geom_point() +
geom_text_repel() +
geom_vline(
xintercept = c(log(2), 0),
col = "#303030",
linetype = "dotted"
) +
geom_hline(
yintercept = -log10(0.1),
col = "#303030",
linetype = "dotted"
) +
scale_color_manual(values = c("#909090", "red")) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
labs(title = "ANOVA differential expression analysis",
x = "logFC",
y = "-log10(p-value)")Warning: Removed 24353 rows containing missing values (`geom_text_repel()`).
DESeq2
tx2gene <- getBM(attributes = c("ensembl_transcript_id_version",
"ensembl_gene_id_version"),
filters = "ensembl_gene_id_version",
values = rownames(expr),
mart = ensembl) %>%
dplyr::select(ensembl_transcript_id_version, ensembl_gene_id_version)
samples <- list.files("rsem")
files <- file.path("rsem", samples, paste0(samples,".isoforms.results"))
txi <- tximport(files, type = "rsem", tx2gene = tx2gene)reading in files with read_tsv
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
summarizing abundance
summarizing counts
summarizing length
Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
design formula are characters, converting to factors
the design formula contains one or more numeric variables with integer values,
specifying a model with increasing fold change for higher values.
did you mean for this to be a factor? if so, first convert
this variable to a factor using the factor() function
the design formula contains one or more numeric variables that have mean or
standard deviation larger than 5 (an arbitrary threshold to trigger this message).
Including numeric variables with large mean can induce collinearity with the intercept.
Users should center and scale numeric variables in the design to improve GLM convergence.
using counts and average transcript lengths from tximport
dds_filtered <-
dds[intersect(rownames(expr)[meta_genes$expressed], rownames(dds)), ]
dds_filtered <- DESeq(dds_filtered, test = "LRT", reduced = ~ sex)estimating size factors
using 'avgTxLength' from assays(dds), correcting for library size
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
edgeR
samples <- list.files("rsem")
counts <- sapply(samples, function(sample) {
file <- paste0("rsem/", sample, "/", sample, ".genes.results")
quant <- read.csv(file, sep = "\t", header = T)
tpm2 <- setNames(quant$expected_count, quant$gene_id)
return(tpm2)
})
counts <- as.data.frame(counts)
counts <- counts %>%
rownames_to_column(var = "gene_name") %>%
arrange(gene_name) %>%
column_to_rownames(var = "gene_name")
head(counts) SRR21355241 SRR21355242 SRR21355243 SRR21355246
ENSMUSG00000000001.5 445 305 226 149
ENSMUSG00000000003.16 0 0 0 0
ENSMUSG00000000028.16 18 11 10 3
ENSMUSG00000000031.19 0 1 0 2
ENSMUSG00000000037.18 0 1 3 2
ENSMUSG00000000049.12 8 5 6 1
SRR21355249 SRR21355250 SRR21355256 SRR21355360
ENSMUSG00000000001.5 531 297 555 408
ENSMUSG00000000003.16 0 0 0 0
ENSMUSG00000000028.16 27 5 24 14
ENSMUSG00000000031.19 0 0 2 1
ENSMUSG00000000037.18 2 4 8 4
ENSMUSG00000000049.12 9 4 11 5
SRR21355364 SRR21355366 SRR21355367 SRR21355374
ENSMUSG00000000001.5 551 418 894 373
ENSMUSG00000000003.16 0 0 0 0
ENSMUSG00000000028.16 22 8 32 13
ENSMUSG00000000031.19 0 2 0 0
ENSMUSG00000000037.18 0 1 14 2
ENSMUSG00000000049.12 11 9 6 6
SRR21355378 SRR21355380 SRR21355381 SRR21355382
ENSMUSG00000000001.5 355 143 293 462
ENSMUSG00000000003.16 0 0 0 0
ENSMUSG00000000028.16 13 2 14 26
ENSMUSG00000000031.19 1 0 0 1
ENSMUSG00000000037.18 2 4 5 6
ENSMUSG00000000049.12 6 2 9 12
SRR21355383 SRR21355696 SRR21355697 SRR21355698
ENSMUSG00000000001.5 683 466 365 350
ENSMUSG00000000003.16 0 0 0 0
ENSMUSG00000000028.16 16 25 16 9
ENSMUSG00000000031.19 0 0 0 0
ENSMUSG00000000037.18 6 4 1 2
ENSMUSG00000000049.12 9 9 7 0
SRR21355699 SRR21355703 SRR21355705
ENSMUSG00000000001.5 193 224 606
ENSMUSG00000000003.16 0 0 0
ENSMUSG00000000028.16 5 13 24
ENSMUSG00000000031.19 0 0 2
ENSMUSG00000000037.18 0 2 3
ENSMUSG00000000049.12 1 1 14
# define groups for design matrix
age <- factor(anno$age)
sex <- factor(anno$sex)
# convert data to DGEList object
y <- DGEList(counts = counts[meta_genes$expressed,], group = age, sex = sex)
# normalize the data to account for differences in library sizes and sequencing depth
y <- calcNormFactors(y)
# create design matrix
design <- model.matrix( ~ age + sex)
design (Intercept) age12 age15 age18 age21 age26 age28 sexmale
1 1 0 0 1 0 0 0 1
2 1 0 1 0 0 0 0 1
3 1 0 0 0 0 0 0 1
4 1 0 1 0 0 0 0 1
5 1 0 0 0 1 0 0 1
6 1 0 0 1 0 0 0 1
7 1 0 0 0 0 0 0 1
8 1 0 0 0 0 0 1 1
9 1 0 0 0 0 0 0 0
10 1 0 0 1 0 0 0 0
11 1 0 1 0 0 0 0 0
12 1 0 0 0 1 0 0 0
13 1 0 0 0 1 0 0 0
14 1 0 0 1 0 0 0 0
15 1 0 1 0 0 0 0 0
16 1 0 0 0 0 0 0 0
17 1 0 0 0 1 0 0 1
18 1 1 0 0 0 0 0 1
19 1 0 0 0 0 1 0 1
20 1 1 0 0 0 0 0 0
21 1 1 0 0 0 0 0 1
22 1 1 0 0 0 0 0 1
23 1 1 0 0 0 0 0 0
attr(,"assign")
[1] 0 1 1 1 1 1 1 2
attr(,"contrasts")
attr(,"contrasts")$age
[1] "contr.treatment"
attr(,"contrasts")$sex
[1] "contr.treatment"
# estimate dispersion
y <- estimateDisp(y, design)
# fit the negative binomial model
fit <- glmFit(y, design)
# conduct genewise statistical tests for a given coefficient contrast (coef = row of design matrix)
lrt_all <- glmLRT(fit, coef = 2:7)
topTags(lrt_all)Coefficient: age12 age15 age18 age21 age26 age28
logFC.age12 logFC.age15 logFC.age18 logFC.age21
ENSMUSG00000109006.4 0.42549918 0.64432382 0.72138263 0.72678372
ENSMUSG00000092178.3 2.70228069 2.83114302 3.43410901 3.64978902
ENSMUSG00000052229.6 -0.59373785 -0.69719440 -0.92894309 -1.06495887
ENSMUSG00000051242.2 0.67363096 0.74441494 0.85017560 0.96052688
ENSMUSG00000103897.2 0.81683705 0.47975724 0.43610018 0.87429764
ENSMUSG00000073418.5 0.28577065 0.76687301 0.69784255 1.69148018
ENSMUSG00000004894.11 0.35355271 0.80515826 0.59799117 1.00051971
ENSMUSG00000022076.11 -1.07314215 -1.14384988 -1.51811213 -1.46217348
ENSMUSG00000102422.2 0.51006045 -0.50716334 -0.30847966 0.02707808
ENSMUSG00000075602.11 0.01113071 -0.01473805 0.06415556 0.41015285
logFC.age26 logFC.age28 logCPM LR PValue
ENSMUSG00000109006.4 0.6629856 0.85019534 6.715016 202.53240 5.484322e-41
ENSMUSG00000092178.3 3.8196167 4.23159934 2.139035 138.00937 2.636076e-27
ENSMUSG00000052229.6 -1.6205104 -1.19079464 4.239970 127.23686 4.905170e-25
ENSMUSG00000051242.2 0.8819022 1.29168983 4.440796 92.90605 7.540477e-18
ENSMUSG00000103897.2 1.0979546 0.91302672 4.382299 82.47698 1.098958e-15
ENSMUSG00000073418.5 1.6704738 2.70322000 3.220873 80.83753 2.398670e-15
ENSMUSG00000004894.11 0.9139761 1.28641546 3.724848 80.52670 2.781013e-15
ENSMUSG00000022076.11 -2.0742497 -2.35490774 1.708595 68.93775 6.750994e-13
ENSMUSG00000102422.2 -8.1383376 -1.72703022 2.182698 63.94556 7.080725e-12
ENSMUSG00000075602.11 0.8262527 -0.08940765 5.247879 63.81576 7.525836e-12
FDR
ENSMUSG00000109006.4 1.335816e-36
ENSMUSG00000092178.3 3.210345e-23
ENSMUSG00000052229.6 3.982508e-21
ENSMUSG00000051242.2 4.591585e-14
ENSMUSG00000103897.2 5.353464e-12
ENSMUSG00000073418.5 9.676733e-12
ENSMUSG00000004894.11 9.676733e-12
ENSMUSG00000022076.11 2.055424e-09
ENSMUSG00000102422.2 1.833068e-08
ENSMUSG00000075602.11 1.833068e-08
# multiple testing correction
decide_dif_all <-
decideTests.DGELRT(
lrt_all,
adjust.method = "holm",
p.value = 0.1#,
#lfc = log(2)
)
summary(decide_dif_all) age28-age26-age21-age18-age15-age12
NotSig 24285
Sig 72
de_genes_edgeR <- rownames(decide_dif_all[decide_dif_all[,1] != 0,])
de_genes_edgeR_ext_name <- meta_genes[de_genes_edgeR,]$external_gene_name
de_genes_edgeR_ext_name [1] "Hapln2" "Cabp7" "Met" "Bcas1"
[5] "Ctsz" "Sparc" "Gfap" "Klhl1"
[9] "Pdlim2" "Apod" "Banp" "Casp1"
[13] "Snca" "Lcn2" "Gatm" "Fermt1"
[17] "Stmn2" "Gnb4" "Hmgcs2" "Calb1"
[21] "Tmeff1" "Plekhb1" "Mt1" "Rims3"
[25] "Zdhhc15" "Camk4" "Rgs4" "Nrn1"
[29] "Flywch1" "Abca8a" "Tmcc2" "Kdm7a"
[33] "Lsm11" "Cirbp" "Vwa5b2" "Marcksl1"
[37] "Eif5a2" "Pcdhb9" "Pcdhb2" "Gpr17"
[41] "H2-Q7" "Cadm2" "Kndc1" "Cst7"
[45] "Lyz2" "Gm20634" "2410018L13Rik" "H2-Q6"
[49] "C4b" "Ly6a" "Ass1" "Igkc"
[53] "Gm4294" "Ifi27l2a" "Ly6c1" "Mdfic2"
[57] "Gm1979" "Tnnc1" "Gm45351" "Neat1"
[61] "Obox3-ps7" "Iqschfp" "Pcdhga7" "Pcdhga8"
[65] "Gm42756" "Gm42715" "Gm44002" "B230209E15Rik"
[69] "9330121K16Rik" "Gm32687" "AA414992" ""
# comparison of 12 and 3 months
lrt <- glmLRT(fit, coef = 2)
decide_dif <-
decideTests.DGELRT(
lrt,
adjust.method = "holm",
p.value = 0.1,
lfc = log(2)
)
ggplot(
data = as.data.frame(lrt$table),
mapping = aes(
x = abs(logFC),
y = -log10(p.adjust(PValue, method = "holm")),
label = meta_genes[rownames(y),]$external_gene_name,
color = as.factor(decide_dif)
)
) +
geom_point() +
geom_vline(xintercept = log(2), linetype = "dashed") +
geom_hline(yintercept = -log10(0.1), linetype = "dashed") +
geom_text_repel() +
labs(
title = "Differential expression 3 vs. 12 months",
x = "LogFC",
y = "-10log(p-value)",
color = "DE"
) +
scale_color_brewer(palette = "Set1") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))Warning: ggrepel: 24298 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
# comparison of 21 and 3 months
lrt <- glmLRT(fit, coef = 5)
decide_dif <-
decideTests.DGELRT(
lrt,
adjust.method = "holm",
p.value = 0.1,
lfc = log(2)
)
ggplot(
data = as.data.frame(lrt$table),
mapping = aes(
x = abs(logFC),
y = -log10(p.adjust(PValue, method = "holm")),
label = meta_genes[rownames(y),]$external_gene_name,
color = as.factor(decide_dif)
)
) +
geom_point() +
geom_vline(xintercept = log(2), linetype = "dashed") +
geom_hline(yintercept = -log10(0.1), linetype = "dashed") +
geom_text_repel() +
labs(
title = "Differential expression 3 vs. 21 months",
x = "LogFC",
y = "-10log(p-value)",
color = "DE"
) +
scale_color_brewer(palette = "Set1") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))Warning: ggrepel: 24297 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
# comparison of 28 and 3 months
lrt <- glmLRT(fit, coef = 7)
decide_dif <-
decideTests.DGELRT(
lrt,
adjust.method = "holm",
p.value = 0.1,
lfc = log(2)
)
ggplot(
data = as.data.frame(lrt$table),
mapping = aes(
x = abs(logFC),
y = -log10(p.adjust(PValue, method = "holm")),
label = meta_genes[rownames(y),]$external_gene_name,
color = as.factor(decide_dif)
)
) +
geom_point() +
geom_vline(xintercept = log(2), linetype = "dashed") +
geom_hline(yintercept = -log10(0.1), linetype = "dashed") +
geom_text_repel() +
labs(
title = "Differential expression 3 vs. 28 months",
x = "LogFC",
y = "-10log(p-value)",
color = "DE"
) +
scale_color_brewer(palette = "Set1") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))Warning: ggrepel: 24301 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
c4b <-
data.frame(expression = unlist(as.vector(expr[meta_genes[meta_genes$external_gene_name == "C4b", ]$ensembl_gene_id_version,])), age = anno$age)
c4b <-
c4b %>% group_by(age) %>% summarise(expression = mean(expression))
c4b# A tibble: 7 × 2
age expression
<int> <dbl>
1 3 1.25
2 12 1.56
3 15 2.04
4 18 1.92
5 21 4.28
6 26 3.84
7 28 7.47
gpr17 <-
data.frame(expression = unlist(as.vector(expr[meta_genes[meta_genes$external_gene_name == "Gpr17", ]$ensembl_gene_id_version,])), age = anno$age)
gpr17 <-
gpr17 %>% group_by(age) %>% summarise(expression = mean(expression))
gpr17# A tibble: 7 × 2
age expression
<int> <dbl>
1 3 8.68
2 12 5.91
3 15 5.08
4 18 4.17
5 21 4.27
6 26 3.03
7 28 3.88
H2Q6 <-
data.frame(expression = unlist(as.vector(expr[meta_genes[meta_genes$external_gene_name == "H2-Q6", ]$ensembl_gene_id_version,])), age = anno$age)
H2Q6 <-
H2Q6 %>% group_by(age) %>% summarise(expression = mean(expression))
H2Q6# A tibble: 7 × 2
age expression
<int> <dbl>
1 3 0.395
2 12 1.21
3 15 1.61
4 18 1.71
5 21 2.44
6 26 4.14
7 28 2.1
H2Q7 <-
data.frame(expression = unlist(as.vector(expr[meta_genes[meta_genes$external_gene_name == "H2-Q7", ]$ensembl_gene_id_version,])), age = anno$age)
H2Q7 <-
H2Q7 %>% group_by(age) %>% summarise(expression = mean(expression))
H2Q7# A tibble: 7 × 2
age expression
<int> <dbl>
1 3 0.617
2 12 1.52
3 15 1.97
4 18 2.46
5 21 3.34
6 26 5.36
7 28 1.89
ggplot(data = c4b,
mapping = aes(
x = as.factor(age),
y = expression,
fill = as.factor(age)
)) +
geom_bar(stat = "identity") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
scale_fill_viridis_d() +
labs(title = "Expression of C4b over time",
x = "Age (months)",
y = "TPM",
fill = "Age")ggplot(data = gpr17,
mapping = aes(
x = as.factor(age),
y = expression,
fill = as.factor(age)
)) +
geom_bar(stat = "identity") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
scale_fill_viridis_d() +
labs(title = "Expression of Gpr17 over time",
x = "Age (months)",
y = "TPM",
fill = "Age")ggplot(data = H2Q6,
mapping = aes(
x = as.factor(age),
y = expression,
fill = as.factor(age)
)) +
geom_bar(stat = "identity") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
scale_fill_viridis_d() +
labs(title = "Expression of H2-Q6 over time",
x = "Age (months)",
y = "TPM",
fill = "Age")ggplot(data = H2Q7,
mapping = aes(
x = as.factor(age),
y = expression,
fill = as.factor(age)
)) +
geom_bar(stat = "identity") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
scale_fill_viridis_d() +
labs(title = "Expression of H2-Q7 over time",
x = "Age (months)",
y = "TPM",
fill = "Age")Comparison of DE analysis methods
UpSet plot:
eR <- decide_dif_all
d2 <- padj * 1
d2 <- ifelse(is.na(d2), 0, d2)
anov <- as.data.frame(res_DE$DE * 1)
rownames(anov) <- res_DE$gene
df <- cbind(eR, d2, anov)
colnames(df) <- c("edgeR", "DESeq2", "ANOVA")
upset(as.data.frame(df), sets = colnames(df)) [1] "Hapln2" "Cabp7" "Met" "Ctsz"
[5] "Sparc" "Klhl1" "Apod" "Fermt1"
[9] "Gnb4" "Hmgcs2" "Calb1" "Camk4"
[13] "Rgs4" "Nrn1" "Flywch1" "Abca8a"
[17] "Lsm11" "Cirbp" "Marcksl1" "Pcdhb9"
[21] "Pcdhb2" "Gpr17" "H2-Q7" "Cadm2"
[25] "Cst7" "Lyz2" "H2-Q6" "C4b"
[29] "Ass1" "Gm45351" "Neat1" "Pcdhga7"
[33] "Gm42756" "B230209E15Rik" "AA414992"
Comparison of p-values:
# combine LogFC and plot
df <- as.data.frame(cbind(p.adjust(lrt_all$table$PValue, method = "holm"), res_DE$padj))
colnames(df) <- c("padj_edgeR", "padj_anova")
ggplot(data = df, mapping = aes(x = -log10(padj_edgeR), y = -log10(padj_anova))) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = "blue") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
scale_color_brewer(palette = "Set1") +
labs(title = "Comparison of adjusted P-values", x = "-log10 P-value (egdeR)", y = "-log10 P-value (ANOVA)")df <- as.data.frame(cbind(lrt_all$table$PValue, res_DE$pval))
colnames(df) <- c("p_edgeR", "p_anova")
ggplot(data = df, mapping = aes(x = -log10(p_edgeR), y = -log10(p_anova))) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = "blue") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
labs(title = "Comparison of P-values", x = "-log10 P-value (egdeR)", y = "-log10 P-value (ANOVA)")Comparison of logFC:
Grouping of the identified DEGs
DEG <- de_genes_edgeR
avg_expr <- as.data.frame(sapply(sort(unique(anno$age))[1:5], function(age) {
rowMeans(expr[, which(anno$age == age)])
}))
avg_expr$V6 <- unlist(as.vector(expr[, which(anno$age == 26)]))
avg_expr$V7 <- unlist(as.vector(expr[, which(anno$age == 28)]))
colnames(avg_expr) <- sort(unique(anno$age))
max_age_DEG <-
setNames(colnames(avg_expr)[apply(avg_expr[DEG, ], 1, which.max)], DEG)max_age_DEG
12 18 21 26 28 3
4 2 3 25 19 19
avg_expr_DEG_list <- tapply(names(max_age_DEG), max_age_DEG, function(x) avg_expr[x,])
scaled_expr_DEG_list <- lapply(avg_expr_DEG_list, function(x) t(scale(t(x))))
layout(matrix(1:8, nrow = 2, byrow = T))
par(mar=c(3,3,3,3))
for(age in names(scaled_expr_DEG_list))
boxplot(scaled_expr_DEG_list[[age]],
main = paste0(age, " (", nrow(scaled_expr_DEG_list[[age]]), ")"))corr_DEG <- cor(t(avg_expr[DEG,]), method = "spearman")
hcl_DEG <- hclust(as.dist(1 - corr_DEG), method = "complete")
plot(hcl_DEG)cl_DEG <- cutree(hcl_DEG, k = 4)
heatmap.2(corr_DEG, Rowv = as.dendrogram(hcl_DEG), Colv = as.dendrogram(hcl_DEG),
trace = "none", scale = "none", labRow = NA, labCol = NA, col = viridis,
ColSideColors = rainbow(15)[cl_DEG])avg_expr_DEG_list <- tapply(names(cl_DEG), cl_DEG, function(x) avg_expr[x,])
scaled_expr_DEG_list <- lapply(avg_expr_DEG_list, function(x) t(scale(t(x))))
layout(matrix(1:4, nrow = 2, byrow = T))
for(cl in 1:4)
boxplot(scaled_expr_DEG_list[[cl]],
main = paste0(cl, " (", nrow(scaled_expr_DEG_list[[cl]]), ")")) [1] "ENSMUSG00000004894.11" "ENSMUSG00000013523.14" "ENSMUSG00000020932.15"
[4] "ENSMUSG00000022090.11" "ENSMUSG00000022548.15" "ENSMUSG00000025889.14"
[7] "ENSMUSG00000027199.15" "ENSMUSG00000027500.11" "ENSMUSG00000027669.15"
[10] "ENSMUSG00000030701.18" "ENSMUSG00000045193.14" "ENSMUSG00000051242.2"
[13] "ENSMUSG00000068129.6" "ENSMUSG00000069516.9" "ENSMUSG00000073418.5"
[16] "ENSMUSG00000076441.10" "ENSMUSG00000076609.3" "ENSMUSG00000091049.9"
[19] "ENSMUSG00000091898.9" "ENSMUSG00000092178.3" "ENSMUSG00000092274.5"
[22] "ENSMUSG00000104674.3" "ENSMUSG00000108158.2" "ENSMUSG00000109006.4"
[25] "ENSMUSG00000113342.2"
[1] ""
[1] "ENSMUSG00000009075.3" "ENSMUSG00000016256.11" "ENSMUSG00000018593.14"
[4] "ENSMUSG00000025888.7" "ENSMUSG00000026822.15" "ENSMUSG00000027356.9"
[7] "ENSMUSG00000032890.18" "ENSMUSG00000040097.17" "ENSMUSG00000041828.16"
[10] "ENSMUSG00000042066.17" "ENSMUSG00000046613.20" "ENSMUSG00000051599.5"
[13] "ENSMUSG00000060550.17" "ENSMUSG00000066129.18" "ENSMUSG00000073409.13"
[16] "ENSMUSG00000079017.4" "ENSMUSG00000090667.3" "ENSMUSG00000093773.2"
[19] "ENSMUSG00000103472.2" "ENSMUSG00000103897.2" "ENSMUSG00000121378.1"
[1] ""
[1] "ENSMUSG00000009376.16" "ENSMUSG00000022076.11" "ENSMUSG00000027875.13"
[4] "ENSMUSG00000028347.15" "ENSMUSG00000047945.7" "ENSMUSG00000052229.6"
[7] "ENSMUSG00000073164.12" "ENSMUSG00000078377.6" "ENSMUSG00000102422.2"
[1] ""
[1] "ENSMUSG00000025316.17" "ENSMUSG00000028222.3" "ENSMUSG00000031765.9"
[4] "ENSMUSG00000033906.6" "ENSMUSG00000038128.8" "ENSMUSG00000038530.12"
[7] "ENSMUSG00000039114.17" "ENSMUSG00000042599.9" "ENSMUSG00000044847.14"
[10] "ENSMUSG00000050192.9" "ENSMUSG00000064115.14" "ENSMUSG00000070392.6"
[13] "ENSMUSG00000075602.11" "ENSMUSG00000079018.11" "ENSMUSG00000107023.3"
[16] "ENSMUSG00000110249.2" "ENSMUSG00000112640.2"
Making sense of the genes
Frequency-based method DAVID (Not optimal as we have low number of DE genes)
#I have to delete .version
expr_no_idversion <- expr
rownames(expr_no_idversion) <- sub("\\.\\d+$", "", rownames(expr_no_idversion))
write.table(meta_genes[meta_genes$ensembl_gene_id_version %in% names(which(cl_DEG==2)), "ensembl_gene_id"],
file = "genes_C2.txt",
quote = F, row.names = F, col.names = F)
write.table(meta_genes[meta_genes$expressed, "ensembl_gene_id"],
file = "genes_expressed.txt",
quote = F, row.names = F, col.names = F)Cluster 3 is the one with down expression progressively across ages, therefore:
#I have to delete .version
expr_no_idversion <- expr
rownames(expr_no_idversion) <- sub("\\.\\d+$", "", rownames(expr_no_idversion))
write.table(meta_genes[meta_genes$ensembl_gene_id_version %in% names(which(cl_DEG==3)), "ensembl_gene_id"],
file = "genes_C3.txt",
quote = F, row.names = F, col.names = F)
write.table(meta_genes[meta_genes$expressed, "ensembl_gene_id"],
file = "genes_expressed.txt",
quote = F, row.names = F, col.names = F)Cluster 1 is the one with up expression progressively across ages, therefore:
#I have to delete .version
expr_no_idversion <- expr
rownames(expr_no_idversion) <- sub("\\.\\d+$", "", rownames(expr_no_idversion))
write.table(meta_genes[meta_genes$ensembl_gene_id_version %in% names(which(cl_DEG==1)), "ensembl_gene_id"],
file = "genes_C1.txt",
quote = F, row.names = F, col.names = F)
write.table(meta_genes[meta_genes$expressed, "ensembl_gene_id"],
file = "genes_expressed.txt",
quote = F, row.names = F, col.names = F)And now cluster 4:
#I have to delete .version
expr_no_idversion <- expr
rownames(expr_no_idversion) <- sub("\\.\\d+$", "", rownames(expr_no_idversion))
write.table(meta_genes[meta_genes$ensembl_gene_id_version %in% names(which(cl_DEG==4)), "ensembl_gene_id"],
file = "genes_C4.txt",
quote = F, row.names = F, col.names = F)
write.table(meta_genes[meta_genes$expressed, "ensembl_gene_id"],
file = "genes_expressed.txt",
quote = F, row.names = F, col.names = F)Rank-based method GSEA
DE_A26 <- DE_test(expr = expr[meta_genes$expressed,],
cond = anno$age == "26",
ctrl = "FALSE",
covar = anno %>% dplyr::select(sex)) %>%
tibble::rownames_to_column("gene")
scores <- setNames(sign(log(DE_A26$fc)) * (-log10(DE_A26$pval)),
setNames(meta_genes$ensembl_gene_id,
meta_genes$ensembl_gene_id_version)[DE_A26$gene])
scores_ordered <- sort(scores, decreasing=T)
library(msigdbr)
genesets_celltype <- msigdbr(species = "Mus musculus", category = "C2")
genesets_celltype_list <- tapply(genesets_celltype$ensembl_gene, genesets_celltype$gs_name, list)
library(fgsea)
fgsea_kegg <- fgsea(pathways = genesets_celltype_list,
stats = scores_ordered[-1],
minSize = 15,
maxSize = 500)Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.34% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : There were 1 pathways for which P-values were not calculated
properly due to unbalanced (positive and negative) gene-level statistic values.
For such pathways pval, padj, NES, log2err are set to NA. You can try to
increase the value of the argument nPermSimple (for example set it nPermSimple
= 10000)
pathway
1: LEI_HOXC8_TARGETS_DN
2: ZHANG_INTERFERON_RESPONSE
3: REACTOME_BUTYRATE_RESPONSE_FACTOR_1_BRF1_BINDS_AND_DESTABILIZES_MRNA
4: KIM_LRRC3B_TARGETS
5: REACTOME_TRISTETRAPROLIN_TTP_ZFP36_BINDS_AND_DESTABILIZES_MRNA
6: WP_HAIR_FOLLICLE_DEVELOPMENT_ORGANOGENESIS_PART_2_OF_3
7: REACTOME_INTERFERON_ALPHA_BETA_SIGNALING
8: EINAV_INTERFERON_SIGNATURE_IN_CANCER
9: RADAEVA_RESPONSE_TO_IFNA1_UP
10: BOYAULT_LIVER_CANCER_SUBCLASS_G5_DN
pval padj log2err ES NES size
1: 0.001904233 0.4432369 0.4550599 0.8103573 1.962052 16
2: 0.005116386 0.4432369 0.4070179 0.7830670 1.956420 19
3: 0.002291917 0.4432369 0.4317077 0.7516141 1.819821 16
4: 0.007041256 0.4432369 0.4070179 0.6902862 1.787900 23
5: 0.002498060 0.4432369 0.4317077 0.7326232 1.773840 16
6: 0.011207564 0.4432369 0.3807304 0.6614050 1.771704 31
7: 0.013389296 0.4432369 0.3807304 0.6125033 1.768737 52
8: 0.003620144 0.4432369 0.4317077 0.6753948 1.744792 24
9: 0.009164758 0.4432369 0.3807304 0.6131793 1.727130 43
10: 0.008353275 0.4432369 0.3807304 0.6639865 1.719782 23
leadingEdge
1: ENSMUSG00000003849,ENSMUSG00000029304,ENSMUSG00000001349,ENSMUSG00000052957,ENSMUSG00000032085,ENSMUSG00000035783
2: ENSMUSG00000025492,ENSMUSG00000035692,ENSMUSG00000070034,ENSMUSG00000028037,ENSMUSG00000039236,ENSMUSG00000026104,...
3: ENSMUSG00000021127,ENSMUSG00000024472,ENSMUSG00000028322,ENSMUSG00000032410
4: ENSMUSG00000035692,ENSMUSG00000028037,ENSMUSG00000060591,ENSMUSG00000060550,ENSMUSG00000060586,ENSMUSG00000060802,...
5: ENSMUSG00000044786,ENSMUSG00000009470,ENSMUSG00000024472,ENSMUSG00000028322,ENSMUSG00000032410
6: ENSMUSG00000022510,ENSMUSG00000015647,ENSMUSG00000041324,ENSMUSG00000001761,ENSMUSG00000040055,ENSMUSG00000028163,...
7: ENSMUSG00000025492,ENSMUSG00000035692,ENSMUSG00000053113,ENSMUSG00000039236,ENSMUSG00000060591,ENSMUSG00000060550,...
8: ENSMUSG00000035692,ENSMUSG00000015846,ENSMUSG00000047407,ENSMUSG00000070034,ENSMUSG00000028037,ENSMUSG00000027078,...
9: ENSMUSG00000025492,ENSMUSG00000035692,ENSMUSG00000028037,ENSMUSG00000027078,ENSMUSG00000060591,ENSMUSG00000057329,...
10: ENSMUSG00000037820,ENSMUSG00000027907,ENSMUSG00000028037,ENSMUSG00000006519,ENSMUSG00000026104,ENSMUSG00000052397,...
library(msigdbr)
genesets_celltype <- msigdbr(species = "Mus musculus", category = "C8")
genesets_celltype_list <- tapply(genesets_celltype$ensembl_gene, genesets_celltype$gs_name, list)
library(fgsea)
fgsea_kegg <- fgsea(pathways = genesets_celltype_list,
stats = scores_ordered[-1],
minSize = 15,
maxSize = 500)Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.34% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
pathway
1: DESCARTES_FETAL_STOMACH_PARIETAL_AND_CHIEF_CELLS
2: BUSSLINGER_GASTRIC_REG3A_POSITIVE_CELLS
3: DURANTE_ADULT_OLFACTORY_NEUROEPITHELIUM_VASCULAR_SMOOTH_MUSCLE_CELLS
4: DESCARTES_FETAL_STOMACH_MESOTHELIAL_CELLS
5: DURANTE_ADULT_OLFACTORY_NEUROEPITHELIUM_PERICYTES
6: RUBENSTEIN_SKELETAL_MUSCLE_ENDOTHELIAL_CELLS
7: AIZARANI_LIVER_C12_NK_NKT_CELLS_4
8: DURANTE_ADULT_OLFACTORY_NEUROEPITHELIUM_FIBROBLASTS_STROMAL_CELLS
9: FAN_EMBRYONIC_CTX_BIG_GROUPS_BRAIN_IMMUNE
10: AIZARANI_LIVER_C17_HEPATOCYTES_3
pval padj log2err ES NES size
1: 0.002648619 0.3484108 0.4317077 0.6931856 1.758385 22
2: 0.005354202 0.3484108 0.4070179 0.6488476 1.724220 29
3: 0.044284243 0.3484108 0.2165428 0.5535044 1.618119 89
4: 0.011193307 0.3484108 0.3807304 0.6286719 1.601858 23
5: 0.044605809 0.3484108 0.2165428 0.5543262 1.598670 75
6: 0.052261307 0.3484108 0.1957890 0.5336202 1.579324 126
7: 0.012336074 0.3484108 0.3807304 0.5906395 1.569540 29
8: 0.044375645 0.3484108 0.2165428 0.5392578 1.563221 78
9: 0.052208835 0.3484108 0.1957890 0.5205918 1.544664 130
10: 0.032188841 0.3484108 0.2616635 0.5403901 1.540768 54
leadingEdge
1: ENSMUSG00000028553,ENSMUSG00000032278,ENSMUSG00000019326,ENSMUSG00000019232,ENSMUSG00000041372,ENSMUSG00000051497,...
2: ENSMUSG00000025492,ENSMUSG00000026822,ENSMUSG00000060550,ENSMUSG00000060586,ENSMUSG00000026104,ENSMUSG00000060802,...
3: ENSMUSG00000001349,ENSMUSG00000012428,ENSMUSG00000044786,ENSMUSG00000029761,ENSMUSG00000053113,ENSMUSG00000036256,...
4: ENSMUSG00000048482,ENSMUSG00000052957,ENSMUSG00000006649,ENSMUSG00000028871,ENSMUSG00000038119,ENSMUSG00000020363,...
5: ENSMUSG00000037872,ENSMUSG00000025608,ENSMUSG00000009687,ENSMUSG00000053113,ENSMUSG00000038393,ENSMUSG00000001930,...
6: ENSMUSG00000041445,ENSMUSG00000025492,ENSMUSG00000025608,ENSMUSG00000035692,ENSMUSG00000038393,ENSMUSG00000034485,...
7: ENSMUSG00000042385,ENSMUSG00000038393,ENSMUSG00000021998,ENSMUSG00000004612,ENSMUSG00000025491,ENSMUSG00000030847,...
8: ENSMUSG00000025927,ENSMUSG00000025492,ENSMUSG00000037820,ENSMUSG00000050370,ENSMUSG00000047330,ENSMUSG00000029761,...
9: ENSMUSG00000025492,ENSMUSG00000035692,ENSMUSG00000058715,ENSMUSG00000034165,ENSMUSG00000027907,ENSMUSG00000009687,...
10: ENSMUSG00000003617,ENSMUSG00000032081,ENSMUSG00000037095,ENSMUSG00000030895,ENSMUSG00000026874,ENSMUSG00000026193,...
DE_A28 <- DE_test(expr = expr[meta_genes$expressed,],
cond = anno$age == "28",
ctrl = "FALSE",
covar = anno %>% dplyr::select(sex)) %>%
tibble::rownames_to_column("gene")
scores <- setNames(sign(log(DE_A28$fc)) * (-log10(DE_A28$pval)),
setNames(meta_genes$ensembl_gene_id,
meta_genes$ensembl_gene_id_version)[DE_A28$gene])
scores_ordered <- sort(scores, decreasing=T)
genesets_celltype <- msigdbr(species = "Mus musculus", category = "C2")
genesets_celltype_list <- tapply(genesets_celltype$ensembl_gene, genesets_celltype$gs_name, list)
fgsea_kegg <- fgsea(pathways = genesets_celltype_list,
stats = scores_ordered[-1],
minSize = 15,
maxSize = 500)Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.33% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
pathway pval padj log2err
1: LEIN_OLIGODENDROCYTE_MARKERS 0.04676754 0.5640283 0.2450418
2: BILANGES_SERUM_AND_RAPAMYCIN_SENSITIVE_GENES 0.03851641 0.5640283 0.2765006
3: REACTOME_EUKARYOTIC_TRANSLATION_ELONGATION 0.05170068 0.5640283 0.2311267
4: WP_PROTEASOME_DEGRADATION 0.02025484 0.5640283 0.3524879
5: KIM_LRRC3B_TARGETS 0.01088003 0.5640283 0.3807304
6: REACTOME_EUKARYOTIC_TRANSLATION_INITIATION 0.06274510 0.5640283 0.2042948
7: KEGG_RIBOSOME 0.04938272 0.5640283 0.2377938
8: REACTOME_SELENOAMINO_ACID_METABOLISM 0.06442953 0.5640283 0.2042948
9: WP_CYTOPLASMIC_RIBOSOMAL_PROTEINS 0.05226960 0.5640283 0.2311267
10: CHNG_MULTIPLE_MYELOMA_HYPERPLOID_UP 0.02237833 0.5640283 0.3524879
ES NES size
1: 0.6375030 2.040223 75
2: 0.6285894 2.020904 69
3: 0.6129909 1.983928 88
4: 0.6194357 1.957097 60
5: 0.7254219 1.954793 23
6: 0.5989848 1.950601 115
7: 0.6001111 1.934551 83
8: 0.6001178 1.932356 112
9: 0.6000208 1.927173 86
10: 0.6141116 1.911793 53
leadingEdge
1: ENSMUSG00000011884,ENSMUSG00000045087,ENSMUSG00000036098,ENSMUSG00000026519,ENSMUSG00000036503,ENSMUSG00000050121,...
2: ENSMUSG00000000740,ENSMUSG00000040952,ENSMUSG00000041453,ENSMUSG00000029614,ENSMUSG00000061477,ENSMUSG00000002395,...
3: ENSMUSG00000000740,ENSMUSG00000040952,ENSMUSG00000041453,ENSMUSG00000029614,ENSMUSG00000060036,ENSMUSG00000061477,...
4: ENSMUSG00000091705,ENSMUSG00000005625,ENSMUSG00000016206,ENSMUSG00000031429,ENSMUSG00000018286,ENSMUSG00000030603,...
5: ENSMUSG00000079339,ENSMUSG00000028037,ENSMUSG00000024661,ENSMUSG00000035042,ENSMUSG00000037321,ENSMUSG00000069516,...
6: ENSMUSG00000070319,ENSMUSG00000000740,ENSMUSG00000040952,ENSMUSG00000016554,ENSMUSG00000041453,ENSMUSG00000029614,...
7: ENSMUSG00000000740,ENSMUSG00000040952,ENSMUSG00000041453,ENSMUSG00000029614,ENSMUSG00000060036,ENSMUSG00000061477,...
8: ENSMUSG00000026986,ENSMUSG00000040354,ENSMUSG00000031584,ENSMUSG00000000740,ENSMUSG00000040952,ENSMUSG00000063179,...
9: ENSMUSG00000000740,ENSMUSG00000040952,ENSMUSG00000041453,ENSMUSG00000029614,ENSMUSG00000060036,ENSMUSG00000061477,...
10: ENSMUSG00000070319,ENSMUSG00000040952,ENSMUSG00000041453,ENSMUSG00000032376,ENSMUSG00000062997,ENSMUSG00000024608,...
DE_A3 <- DE_test(expr = expr[meta_genes$expressed,],
cond = anno$age == "3",
ctrl = "FALSE",
covar = anno %>% dplyr::select(sex)) %>%
tibble::rownames_to_column("gene")
scores <- setNames(sign(log(DE_A3$fc)) * (-log10(DE_A3$pval)),
setNames(meta_genes$ensembl_gene_id,
meta_genes$ensembl_gene_id_version)[DE_A3$gene])
scores_ordered <- sort(scores, decreasing=T)
genesets_celltype <- msigdbr(species = "Mus musculus", category = "C2")
genesets_celltype_list <- tapply(genesets_celltype$ensembl_gene, genesets_celltype$gs_name, list)
fgsea_kegg <- fgsea(pathways = genesets_celltype_list,
stats = scores_ordered,
minSize = 15,
maxSize = 500)Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.41% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
pathway
1: REACTOME_RESPIRATORY_ELECTRON_TRANSPORT_ATP_SYNTHESIS_BY_CHEMIOSMOTIC_COUPLING_AND_HEAT_PRODUCTION_BY_UNCOUPLING_PROTEINS
2: REACTOME_RESPIRATORY_ELECTRON_TRANSPORT
3: WP_ELECTRON_TRANSPORT_CHAIN_OXPHOS_SYSTEM_IN_MITOCHONDRIA
4: REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT
5: REACTOME_COMPLEX_I_BIOGENESIS
6: WP_MITOCHONDRIAL_COMPLEX_I_ASSEMBLY_MODEL_OXPHOS_SYSTEM
7: KEGG_OXIDATIVE_PHOSPHORYLATION
8: WP_OXIDATIVE_PHOSPHORYLATION
9: KEGG_PARKINSONS_DISEASE
10: MALONEY_RESPONSE_TO_17AAG_DN
pval padj log2err ES NES size
1: 3.862413e-16 1.736541e-12 1.0376962 0.6616233 2.526326 125
2: 1.180405e-14 2.653551e-11 0.9865463 0.6789028 2.522467 102
3: 1.871573e-12 2.103648e-09 0.8986712 0.6550337 2.426148 101
4: 1.772006e-14 2.655646e-11 0.9759947 0.6038376 2.404693 171
5: 1.051930e-08 5.254977e-06 0.7477397 0.6813267 2.299007 56
6: 2.132274e-08 9.586704e-06 0.7337620 0.6807924 2.285191 55
7: 9.838494e-11 8.846773e-08 0.8390889 0.5921459 2.255667 122
8: 5.273679e-07 1.481904e-04 0.6594444 0.6427689 2.194115 59
9: 3.723448e-09 2.092578e-06 0.7749390 0.5659178 2.151906 121
10: 1.412064e-07 4.232426e-05 0.6901325 0.6025443 2.145079 77
leadingEdge
1: ENSMUSG00000064370,ENSMUSG00000064363,ENSMUSG00000064345,ENSMUSG00000064354,ENSMUSG00000064367,ENSMUSG00000021520,...
2: ENSMUSG00000064370,ENSMUSG00000064363,ENSMUSG00000064345,ENSMUSG00000064354,ENSMUSG00000064367,ENSMUSG00000021520,...
3: ENSMUSG00000064370,ENSMUSG00000064363,ENSMUSG00000064345,ENSMUSG00000064354,ENSMUSG00000064367,ENSMUSG00000021520,...
4: ENSMUSG00000064370,ENSMUSG00000064363,ENSMUSG00000064345,ENSMUSG00000064354,ENSMUSG00000064367,ENSMUSG00000021520,...
5: ENSMUSG00000064363,ENSMUSG00000064345,ENSMUSG00000064367,ENSMUSG00000035142,ENSMUSG00000064341,ENSMUSG00000064368,...
6: ENSMUSG00000064363,ENSMUSG00000064345,ENSMUSG00000064367,ENSMUSG00000035142,ENSMUSG00000064341,ENSMUSG00000064368,...
7: ENSMUSG00000064370,ENSMUSG00000064363,ENSMUSG00000064345,ENSMUSG00000064354,ENSMUSG00000064367,ENSMUSG00000021520,...
8: ENSMUSG00000064363,ENSMUSG00000064345,ENSMUSG00000064367,ENSMUSG00000064341,ENSMUSG00000064357,ENSMUSG00000064368,...
9: ENSMUSG00000064370,ENSMUSG00000064363,ENSMUSG00000025889,ENSMUSG00000064345,ENSMUSG00000064354,ENSMUSG00000064367,...
10: ENSMUSG00000025889,ENSMUSG00000038489,ENSMUSG00000009013,ENSMUSG00000038607,ENSMUSG00000022205,ENSMUSG00000057278,...
DE_A12 <- DE_test(expr = expr[meta_genes$expressed,],
cond = anno$age == "12",
ctrl = "FALSE",
covar = anno %>% dplyr::select(sex)) %>%
tibble::rownames_to_column("gene")
scores <- setNames(sign(log(DE_A12$fc)) * (-log10(DE_A12$pval)),
setNames(meta_genes$ensembl_gene_id,
meta_genes$ensembl_gene_id_version)[DE_A12$gene])
scores_ordered <- sort(scores, decreasing=T)
genesets_celltype <- msigdbr(species = "Mus musculus", category = "C2")
genesets_celltype_list <- tapply(genesets_celltype$ensembl_gene, genesets_celltype$gs_name, list)
fgsea_kegg <- fgsea(pathways = genesets_celltype_list,
stats = scores_ordered,
minSize = 15,
maxSize = 500)Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.44% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : There were 33 pathways for which P-values were not calculated
properly due to unbalanced (positive and negative) gene-level statistic values.
For such pathways pval, padj, NES, log2err are set to NA. You can try to
increase the value of the argument nPermSimple (for example set it nPermSimple
= 10000)
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : For some of the pathways the P-values were likely overestimated. For
such pathways log2err is set to NA.
pathway
1: PID_EPHA2_FWD_PATHWAY
2: REACTOME_INTERACTION_BETWEEN_L1_AND_ANKYRINS
3: PID_HIF2PATHWAY
4: STARK_PREFRONTAL_CORTEX_22Q11_DELETION_UP
5: PID_RHOA_REG_PATHWAY
6: DACOSTA_UV_RESPONSE_VIA_ERCC3_TTD_DN
7: REACTOME_NRAGE_SIGNALS_DEATH_THROUGH_JNK
8: PID_UPA_UPAR_PATHWAY
9: WP_MFAP5_EFFECT_ON_PERMEABILITY_AND_MOTILITY_OF_ENDOTHELIAL_CELLS_VIA_CYTOSKELETON_REARRANGEMENT
10: GENTILE_UV_RESPONSE_CLUSTER_D4
pval padj log2err ES NES size
1: 6.534150e-05 4.147977e-03 0.5384341 0.7627557 1.860449 19
2: 7.441880e-05 4.485923e-03 0.5384341 0.6984472 1.811016 28
3: 1.127958e-04 5.861183e-03 0.5384341 0.6815470 1.787110 31
4: 1.946899e-12 4.344504e-09 0.8986712 0.5771378 1.780627 194
5: 3.555479e-05 2.783878e-03 0.5573322 0.6339556 1.774314 46
6: 2.536675e-06 3.573722e-04 0.6272567 0.5902205 1.748159 82
7: 1.980133e-05 1.747264e-03 0.5756103 0.6089109 1.743069 57
8: 2.131539e-04 9.513058e-03 0.5188481 0.6646845 1.731582 29
9: 7.830164e-04 2.314306e-02 0.4772708 0.7153930 1.727642 18
10: 7.336258e-05 4.485923e-03 0.5384341 0.6042992 1.718746 54
leadingEdge
1: ENSMUSG00000034342,ENSMUSG00000033721,ENSMUSG00000027954,ENSMUSG00000032737,ENSMUSG00000002489,ENSMUSG00000027646,...
2: ENSMUSG00000056258,ENSMUSG00000031543,ENSMUSG00000069601,ENSMUSG00000020315,ENSMUSG00000057738,ENSMUSG00000064329,...
3: ENSMUSG00000036450,ENSMUSG00000027954,ENSMUSG00000030103,ENSMUSG00000024140,ENSMUSG00000009406,ENSMUSG00000062960,...
4: ENSMUSG00000029673,ENSMUSG00000056608,ENSMUSG00000050310,ENSMUSG00000042364,ENSMUSG00000005089,ENSMUSG00000047013,...
5: ENSMUSG00000035133,ENSMUSG00000033721,ENSMUSG00000066406,ENSMUSG00000004677,ENSMUSG00000031442,ENSMUSG00000058230,...
6: ENSMUSG00000031586,ENSMUSG00000037646,ENSMUSG00000035614,ENSMUSG00000038126,ENSMUSG00000033278,ENSMUSG00000001151,...
7: ENSMUSG00000033721,ENSMUSG00000002489,ENSMUSG00000021708,ENSMUSG00000066406,ENSMUSG00000031442,ENSMUSG00000028059,...
8: ENSMUSG00000027087,ENSMUSG00000040249,ENSMUSG00000058325,ENSMUSG00000027646,ENSMUSG00000031955,ENSMUSG00000026193,...
9: ENSMUSG00000030516,ENSMUSG00000021823,ENSMUSG00000029528,ENSMUSG00000027087,ENSMUSG00000022836,ENSMUSG00000013936,...
10: ENSMUSG00000030103,ENSMUSG00000032702,ENSMUSG00000055065,ENSMUSG00000040761,ENSMUSG00000028479,ENSMUSG00000021838,...
plotEnrichment(genesets_celltype_list[["PID_EPHA2_FWD_PATHWAY"]],scores_ordered) + labs(title="PID_EPHA2_FWD_PATHWAY") pathway pval
1: WP_ELECTRON_TRANSPORT_CHAIN_OXPHOS_SYSTEM_IN_MITOCHONDRIA 4.393021e-10
2: REACTOME_COMPLEX_I_BIOGENESIS 7.298615e-07
3: WP_OXIDATIVE_PHOSPHORYLATION 7.669988e-07
4: REACTOME_RESPIRATORY_ELECTRON_TRANSPORT 1.933683e-07
5: REACTOME_ANTIMICROBIAL_PEPTIDES 6.578347e-04
6: WP_MITOCHONDRIAL_COMPLEX_I_ASSEMBLY_MODEL_OXPHOS_SYSTEM 6.598843e-05
7: LEIN_OLIGODENDROCYTE_MARKERS 6.962380e-06
8: KORKOLA_TERATOMA 2.086173e-04
9: KEGG_DRUG_METABOLISM_CYTOCHROME_P450 5.650876e-04
10: MOOTHA_VOXPHOS 4.863726e-06
padj log2err ES NES size
1: 3.797451e-07 0.8140358 -0.4826310 -2.503480 101
2: 1.551129e-04 0.6594444 -0.5342649 -2.359534 56
3: 1.555962e-04 0.6594444 -0.5096109 -2.263485 59
4: 6.164306e-05 0.6901325 -0.4280511 -2.240438 102
5: 2.038831e-02 0.4772708 -0.6164926 -2.122986 21
6: 4.147977e-03 0.5384341 -0.4763504 -2.104925 55
7: 7.226303e-04 0.6105269 -0.4253281 -2.081980 75
8: 9.404635e-03 0.5188481 -0.5195085 -2.011514 36
9: 1.801419e-02 0.4772708 -0.5205306 -1.950327 31
10: 5.866705e-04 0.6105269 -0.4095270 -1.941403 87